1. Visualize the total production of honey across years by state. Use color to highlight the west coast (Washington, Oregon, and California) with a different color used for each west coast state.

# Read the raw honey production data as a tibble.
honey <- import(here("data", "honeyproduction.csv"), setclass = "tbl_df")

# Read the census region lookup. The `State Code` column becomes `state` (the
# join key), `State` becomes `state_complete`, and every column name is
# lower-cased.
regions <- import(
  here("data", "us_census_bureau_regions_and_divisions.csv"),
  setclass = "tbl_df"
) %>%
  rename(state = `State Code`, state_complete = State) %>%
  rename_with(tolower)

# Attach the region columns to every honey row and rescale total production
# by one million so the plots can report it in millions.
honey_full <- honey %>%
  left_join(regions, by = "state") %>%
  mutate(totalprod = totalprod / 1000000)

# Columns needed for the per-state plots, with the state name as a factor.
total_prod <- honey_full %>%
  select(state_complete, totalprod, year) %>%
  mutate(state_complete = as.factor(state_complete))

We made several attempts to show the change in production over time. The first version was a heatmap. Highlighting a state's tiles with ‘gghighlight’ would obscure that state's fluctuations across years, so we highlighted only the axis labels of the Western states.

# Axis-label styling for the heatmap: one entry per state, in sorted order.
states <- sort(unique(total_prod$state_complete))

# Each West Coast state gets its own colour; every other state is grey.
label_color <- case_when(
  states == "Oregon" ~ "#03b800",
  states == "Washington" ~ "#4714ff",
  states == "California" ~ "#ffbc1f",
  TRUE ~ "gray30"
)

# West Coast labels are bold, all others plain.
label_face <- if_else(
  states %in% c("Oregon", "Washington", "California"),
  "bold",
  "plain"
)

# Heatmap: years on the x-axis, states on the y-axis, fill = total production.
p1 <- total_prod %>%
  ggplot(mapping = aes(x = year, y = state_complete)) +
  geom_tile(mapping = aes(fill = totalprod)) +
  scale_fill_viridis_c(option = "magma") +
  labs(
    title = "U.S. Honey Production",
    subtitle = "Total production of honey across years by state",
    caption = "Data: #tidytuesday",
    y = "U.S. States",
    fill = "Total production, 
in millions of lbs."
  ) +
  theme(
    axis.text.y = element_text(color = label_color, face = label_face)
  )

p1

Next, we tried a heatmap with a different arrangement of the axes.

# Second heatmap: state codes on the x-axis, years on the y-axis.
#
# The x-axis shows the `state` codes, not the full state names, so the label
# styling is built here from the sorted codes themselves. A discrete axis
# orders character values by sorting them, so these vectors line up with the
# ticks one-to-one.
# NOTE(review): assumes `state` holds two-letter postal codes ("CA", "OR",
# "WA"); confirm against the data.
heatmap_codes <- sort(unique(as.character(honey_full$state)))
heatmap_code_colors <- c(OR = "#03b800", WA = "#4714ff", CA = "#ffbc1f")

# Look up each code's colour; codes without an entry fall back to grey.
heatmap_label_color <- unname(heatmap_code_colors[heatmap_codes])
heatmap_label_color[is.na(heatmap_label_color)] <- "gray30"

heatmap_label_face <- ifelse(
  heatmap_codes %in% names(heatmap_code_colors),
  "bold",
  "plain"
)

ggplot(honey_full, aes(state, year, fill = totalprod)) +
  # Thin white borders separate the tiles.
  geom_tile(color = "white", size = 0.25) +
  labs(
    title = "U.S. Honey Production",
    subtitle = "Total production of honey across years by state",
    caption = "Data: #tidytuesday"
  ) +
  # One break per year, with no padding above or below the tiles.
  scale_y_continuous(breaks = seq(1998, 2012, 1), expand = c(0, 0)) +
  guides(
    fill = guide_legend(
      title = "Total Production, 
in millions of lbs.",
      label.position = "bottom",
      label.hjust = 1,
      keywidth = 4,
      keyheight = .8
    )
  ) +
  theme(
    panel.grid.major = element_blank(),
    axis.title = element_blank(),
    axis.text.x = element_text(
      color = heatmap_label_color,
      face = heatmap_label_face
    )
  )

Another way to visualize the change over time while highlighting the Western states is a line plot in which all other states sit in the background. Below are two versions: one with the three Western states together, and one with each of them in its own facet.

# Single-panel line plot: the three West Coast states keep their mapped
# colours, and every other state is redrawn as a thin, faint grey line.
total_prod %>%
  ggplot(mapping = aes(x = year, y = totalprod, color = state_complete)) +
  geom_line(size = 1) +
  gghighlight(
    state_complete %in% c("California", "Oregon", "Washington"),
    unhighlighted_params = list(size = 0.5, colour = alpha("grey20", 0.2))
  ) +
  labs(
    title = "U.S. Honey Production",
    subtitle = "Total production of honey across years by state",
    caption = "Data: #tidytuesday",
    y = "Total production, in millions of lbs."
  )

# Faceted version of the same plot with thicker highlighted lines; stored as
# `p2` so it can be reused later.
p2 <- total_prod %>%
  ggplot(mapping = aes(x = year, y = totalprod, color = state_complete)) +
  geom_line(size = 1.5) +
  facet_wrap(~state_complete) +
  gghighlight(
    state_complete %in% c("California", "Oregon", "Washington"),
    unhighlighted_params = list(size = 0.5, colour = alpha("grey20", 0.2))
  ) +
  labs(
    title = "U.S. Honey Production",
    subtitle = "Total production of honey across years by state",
    caption = "Data: #tidytuesday",
    y = "Total production, in millions of lbs."
  )

p2

We also made a line plot for each state. Besides highlighting the line for each Western state with a different color, we tried printing that state's facet label in a different style and color as well. However, ggplot did not accept a vector of colors in ‘element_text()’, and none of our troubleshooting helped.

# Label styling, one entry per state in factor-level order. Discrete axes and
# facets follow the factor levels, so the vectors must be built from the
# levels rather than from the order in which states first appear in the data.
states <- levels(droplevels(total_prod$state_complete))

# Each West Coast state gets its own colour; every other state is grey.
label_color <- case_when(
  states == "Oregon" ~ "#03b800",
  states == "Washington" ~ "#4714ff",
  states == "California" ~ "#ffbc1f",
  TRUE ~ "gray30"
)

# West Coast labels are bold, all others plain.
label_face <- ifelse(
  states %in% c("Oregon", "Washington", "California"),
  "bold",
  "plain"
)

total_prod %>%
  # Add an explicit NA row for every state/year combination that is missing.
  complete(state_complete, year) %>%
  ggplot(aes(year, totalprod)) +
  # Base layer: every state's line in grey, one facet per state.
  geom_line(colour = "grey40", size = 1.5) +
  # Overlay a coloured line for each West Coast state. Exact matching on the
  # state name is used instead of a regular-expression match.
  geom_line(
    data = filter(total_prod, state_complete == "California"),
    color = "#ffbc1f"
  ) +
  geom_line(
    data = filter(total_prod, state_complete == "Oregon"),
    color = "#03b800"
  ) +
  geom_line(
    data = filter(total_prod, state_complete == "Washington"),
    color = "#4714ff"
  ) +
  facet_wrap(~state_complete) +
  # Attempt at per-facet strip styling, kept from the original analysis.
  theme(strip.text = element_text(color = label_color, face = label_face)) +
  labs(
    y = "Total production, in millions of lbs.",
    title = "U.S. Honey Production",
    subtitle = "Total production of honey across years by state",
    caption = "Data: #tidytuesday"
  )

Finally, we tried a dumbbell plot to show the difference in honey production between 1998 and 2012. This plot is not ideal, since it shows only the change between the initial and final years, without reflecting any fluctuations in between.

# Dumbbell plot data: keep only the two endpoint years. `year` becomes a
# character so it maps to a discrete point colour. State levels that no longer
# occur after the filter are dropped, so the factor levels are exactly the
# states drawn on the y-axis.
dumbbell <- total_prod %>%
  mutate(year = as.character(year)) %>%
  filter(year %in% c("1998", "2012")) %>%
  mutate(state_complete = droplevels(as.factor(state_complete)))

# y-axis label styling, computed from this plot's own factor levels so each
# entry lines up with its tick.
dumbbell_states <- levels(dumbbell$state_complete)

dumbbell_label_color <- case_when(
  dumbbell_states == "Oregon" ~ "#03b800",
  dumbbell_states == "Washington" ~ "#4714ff",
  dumbbell_states == "California" ~ "#ffbc1f",
  TRUE ~ "gray30"
)

dumbbell_label_face <- ifelse(
  dumbbell_states %in% c("Oregon", "Washington", "California"),
  "bold",
  "plain"
)

ggplot(dumbbell, aes(totalprod, state_complete)) +
  # Grey connector between each state's two points.
  geom_line(aes(group = state_complete), color = "gray40") +
  # Coloured connectors for the West Coast states (exact name match).
  geom_line(
    data = filter(dumbbell, state_complete == "California"),
    color = "#ffbc1f"
  ) +
  geom_line(
    data = filter(dumbbell, state_complete == "Oregon"),
    color = "#015200"
  ) +
  geom_line(
    data = filter(dumbbell, state_complete == "Washington"),
    color = "#2600ad"
  ) +
  # End points, coloured by year.
  geom_point(aes(color = year)) +
  labs(
    x = "Total production, in millions of lbs.",
    y = "U.S. state",
    title = "U.S. Honey Production",
    subtitle = "Change in production between 1998 and 2012 by state",
    caption = "Data: #tidytuesday"
  ) +
  theme(
    axis.text.y = element_text(
      color = dumbbell_label_color,
      face = dumbbell_label_face
    )
  )

2. Reproduce the plot according to three different kinds of color blindness, as well as a desaturated version.

We reproduced color-blindness simulations for one of the heatmaps and one of the faceted plots. The heatmap uses a sequential palette, while the faceted plot uses a qualitative one. The heatmap better illustrates the gradation of change over the years and lets the differences between palettes show, while the faceted plot highlights the Western states more effectively.

# Color-vision-deficiency check of the heatmap stored in `p1`.
colorblindr::cvd_grid(p1)

# Same check for the faceted line plot stored in `p2`.
colorblindr::cvd_grid(p2)

3. Reproduce the plot using a color blind safe palette.

# Reminder of which sequential palettes are color blind safe:
# display.brewer.all(type = "seq", colorblindFriendly = TRUE)

# y-axis label styling, built from the factor levels so each entry lines up
# with its tick regardless of the order of rows in the data.
p3_states <- levels(droplevels(total_prod$state_complete))

p3_label_color <- case_when(
  p3_states == "Oregon" ~ "#03b800",
  p3_states == "Washington" ~ "#4714ff",
  p3_states == "California" ~ "#ffbc1f",
  TRUE ~ "gray30"
)

p3_label_face <- ifelse(
  p3_states %in% c("Oregon", "Washington", "California"),
  "bold",
  "plain"
)

# Heatmap of total production (years x states) on a sequential "Blues" scale.
p3 <- ggplot(total_prod, aes(year, state_complete)) +
  geom_tile(aes(fill = totalprod)) +
  labs(
    y = "U.S. States",
    fill = "Total production, 
in millions of lbs.",
    title = "U.S. Honey Production",
    subtitle = "Total production of honey across years by state",
    caption = "Data: #tidytuesday"
  ) +
  theme(
    axis.text.y = element_text(color = p3_label_color, face = p3_label_face)
  ) +
  scale_fill_continuous_sequential("Blues", begin = 0.25, end = 1)

p3

# Check the result under simulated color-vision deficiencies.
colorblindr::cvd_grid(p3)

# Faceted line plot with the West Coast states highlighted, using the
# Okabe-Ito colour scale.
p4 <- total_prod %>%
  ggplot(mapping = aes(x = year, y = totalprod, color = state_complete)) +
  geom_line(size = 1.5) +
  facet_wrap(~state_complete) +
  gghighlight(
    state_complete %in% c("California", "Oregon", "Washington"),
    unhighlighted_params = list(size = 0.5, colour = alpha("grey20", 0.2))
  ) +
  scale_color_OkabeIto() +
  labs(
    title = "U.S. Honey Production",
    subtitle = "Total production of honey across years by state",
    caption = "Data: #tidytuesday",
    y = "Total production, in millions of lbs."
  )

p4

# Check the result under simulated color-vision deficiencies.
colorblindr::cvd_grid(p4)

4. Produce a bar plot of average honey produced by each state (collapsed across years) with color highlighting region.

  • Download the file here denoting the region and division of each state.
  • Join the file with your honey file.
  • Produce a bar plot displaying the average honey for each state (collapsing across years).
  • Use color to highlight the region of the country the state is from.
  • Note patterns you notice.

(Note: I tried out two different qualitative color palettes for this bar plot.)

It appears that, on average, states in the Midwestern and Western regions of the US produce the most honey and that states in the Northeast region produce the least. States from the Southern region of the US also appear to produce less overall, with the exception of Florida. The states with the most honey production are fairly large in terms of geographic area. It would be interesting to see if there is a relationship between the geographic area of a state and honey production (e.g., smaller states are associated with lower honey production). I’m also curious if there is a certain crop grown in North and South Dakota that commercial honey bees forage on (e.g., clover) and if there is a relationship between the prevalence of wind-pollinated or self-pollinated crops (e.g., corn, wheat, grass seed, soy) and lower commercial honey production that could explain why some states have lower honey production (in particular, the states in the Midwest that produce much less honey).

# Load data ----

honey <- import(here("data", "honeyproduction.csv"), setclass = "tbl_df")

# Region lookup: `State Code` becomes `state` (the join key), `State` becomes
# `state_complete`, and all column names are lower-cased.
regions <- import(
  here("data", "us_census_bureau_regions_and_divisions.csv"),
  setclass = "tbl_df"
) %>%
  rename(state = `State Code`, state_complete = State) %>%
  rename_with(tolower)

# Join honey and regions.
honey_regions <- left_join(honey, regions, by = "state")

# Number of observations per state, used to check whether every state has the
# same number of rows.
state_obs <- honey_regions %>%
  count(state)

# Average production across years per state (in millions and raw), keeping
# the region alongside. `.groups = "drop"` returns an ungrouped tibble, so
# later steps do not inherit grouping by `state_complete`.
honey_st_avg <- honey_regions %>%
  group_by(state_complete, region) %>%
  summarize(
    state_avg_mil = mean(totalprod) / 1000000,
    state_avg = mean(totalprod),
    .groups = "drop"
  )
# Bar plot of average production per state, filled by region, using the
# Okabe-Ito palette. States are ordered by their average production.
ggplot(
  data = honey_st_avg,
  mapping = aes(
    x = state_avg_mil,
    y = fct_reorder(state_complete, state_avg_mil),
    fill = region
  )
) +
  geom_col(alpha = .9) +
  scale_fill_OkabeIto() +
  scale_x_continuous(
    breaks = c(0, 10, 20, 30),
    labels = c("0", "10 million", "20 million", "30 million"),
    expand = c(0, 0)
  ) +
  labs(
    title = "US Honey Production by State",
    subtitle = "1998 to 2012",
    x = "Average Yearly Production (Pounds)",
    y = "State",
    fill = "Region",
    caption = "Source: #tidytuesday"
  ) +
  theme_minimal() +
  # Remove the horizontal grid lines and align the title with the plot edge.
  theme(
    plot.title.position = "plot",
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank()
  )

# Second version of the bar plot, identical except for the qualitative
# "dark 2" fill palette.
ggplot(
  data = honey_st_avg,
  mapping = aes(
    x = state_avg_mil,
    y = fct_reorder(state_complete, state_avg_mil),
    fill = region
  )
) +
  geom_col(alpha = .9) +
  scale_fill_discrete_qualitative("dark 2") +
  scale_x_continuous(
    breaks = c(0, 10, 20, 30),
    labels = c("0", "10 million", "20 million", "30 million"),
    expand = c(0, 0)
  ) +
  labs(
    title = "US Honey Production by State",
    subtitle = "1998 to 2012",
    x = "Average Yearly Production (Pounds)",
    y = "State",
    fill = "Region",
    caption = "Source: #tidytuesday"
  ) +
  theme_minimal() +
  # Remove the horizontal grid lines and align the title with the plot edge.
  theme(
    plot.title.position = "plot",
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank()
  )

5. Create a heatmap displaying the average honey production across years by region (averaging across states within region).

# Average production per region and year (in millions and raw), averaging
# over the states within each region. `.groups = "drop"` returns an ungrouped
# tibble.
honey_avg_region <- honey_regions %>%
  group_by(region, year) %>%
  summarize(
    reg_avg_mil = mean(totalprod) / 1000000,
    reg_avg = mean(totalprod),
    .groups = "drop"
  )

# Heatmap: years on the x-axis, regions on the y-axis (ordered by their
# average production), fill = regional average in millions.
honey_avg_region %>%
  ggplot(aes(year, fct_reorder(region, reg_avg_mil))) +
  geom_tile(aes(fill = reg_avg_mil)) +
  scale_fill_continuous_sequential(
    palette = "Red-Yellow",
    breaks = c(2, 4, 6, 8),
    labels = c("2 million", "4 million", "6 million", "8 million")
  ) +
  scale_x_continuous(
    expand = c(0, 0),
    breaks = c(1998, 2003, 2008, 2012)
  ) +
  labs(
    title = "US Regional Honey Production",
    subtitle = "1998 to 2012",
    y = "Region",
    x = "Year",
    caption = "Source: #tidytuesday",
    fill = "Average Production (Pounds)"
  ) +
  theme_minimal() +
  theme(
    plot.title.position = "plot",
    panel.grid = element_blank()
  )

6. Create at least one more plot of your choosing using color to distinguish, represent data values, or highlight.

# Map of honey price per pound, one panel per year.
us <- usa_sf()

# Rename the `iso_3166_2` column so it matches the `state` join key in `honey`.
us <- rename(us, state = iso_3166_2)

# Keep every row of `us`, attaching honey rows where they exist. The join key
# is spelled out instead of being inferred from shared column names.
# NOTE(review): states with no honey rows get `year = NA` and may show up as
# an extra "NA" facet; confirm whether that is wanted.
df <- right_join(honey, us, by = "state")

df_tib <- as_tibble(df)

ggplot(data = df_tib, aes(geometry = geometry, fill = priceperlb)) +
  # `alpha` belongs on the layer; passed to ggplot() it has no effect.
  geom_sf(color = "white", size = 0, alpha = 0.9) +
  guides(color = "none") +
  facet_wrap(~year) +
  scale_fill_viridis(
    name = "Price per pound",
    option = "magma",
    labels = scales::dollar,
    limits = c(0, 5)
  ) +
  labs(
    title = "Price of honey per pound from 1998-2012",
    caption = "Source: #tidytuesday"
  ) +
  theme_void()